WordCloud of sleep text scores¶
In [12]:
%%bash
sudo apt update
sudo apt install fonts-ipaexfont # for Japanese in wordcloud
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB] Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease Fetched 128 kB in 2s (75.4 kB/s) Reading package lists... Building dependency tree... Reading state information... 52 packages can be upgraded. Run 'apt list --upgradable' to see them. Reading package lists... Building dependency tree... Reading state information... fonts-ipaexfont is already the newest version (00401-3ubuntu1). 0 upgraded, 0 newly installed, 0 to remove and 52 not upgraded.
WARNING: apt does not have a stable CLI interface. Use with caution in scripts. W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?) WARNING: apt does not have a stable CLI interface. Use with caution in scripts.
In [13]:
!pip install wordcloud
!pip install japanize-matplotlib # for Japanese in matplotlib graph
Requirement already satisfied: wordcloud in /usr/local/lib/python3.10/dist-packages (1.9.4) Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python3.10/dist-packages (from wordcloud) (1.26.4) Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from wordcloud) (11.0.0) Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from wordcloud) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (1.3.1) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (4.55.3) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (1.4.7) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (24.2) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (3.2.0) Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.17.0) Requirement already satisfied: japanize-matplotlib in /usr/local/lib/python3.10/dist-packages (1.1.3) Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from japanize-matplotlib) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (1.3.1) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (4.55.3) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (1.4.7) Requirement already satisfied: numpy<2,>=1.21 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (1.26.4) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (24.2) Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (11.0.0) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (3.2.0) Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->japanize-matplotlib) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->japanize-matplotlib) (1.17.0)
Import libraries¶
In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib # for Japanese in matplotlib graph
from wordcloud import WordCloud, STOPWORDS
Setup working directory¶
In [15]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Documents/ds2024/dsF1/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Documents/ds2024/dsF1
Parameters¶
In [16]:
csv_in = 'sleep-text-score-wakati.csv'
Read CSV file¶
In [17]:
df = pd.read_csv(csv_in, sep=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())
(426, 4) <class 'pandas.core.frame.DataFrame'> RangeIndex: 426 entries, 0 to 425 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 text 426 non-null object 1 GPT-4o 426 non-null int64 2 Gemini-1.5-Pro 426 non-null int64 3 Claude-3.5-Sonnet 426 non-null int64 dtypes: int64(3), object(1) memory usage: 13.4+ KB None
| text | GPT-4o | Gemini-1.5-Pro | Claude-3.5-Sonnet | |
|---|---|---|---|---|
| 0 | 就寝 時間 毎日 一定 する | 2 | 2 | 2 |
| 1 | 朝日 積極的 浴びる | 2 | 2 | 2 |
| 2 | 寝室 温度 18 -22度 保つ | 2 | 2 | 2 |
| 3 | 就寝 前 ストレッチ 体 リラックス さ せる | 2 | 2 | 2 |
| 4 | 寝具 定期的 清潔 保つ | 2 | 2 | 2 |
Check the number of documents in each category¶
In [18]:
print(df['Gemini-1.5-Pro'].value_counts().sort_index(ascending=True))
Gemini-1.5-Pro 0 182 1 75 2 169 Name: count, dtype: int64
Generating WordCloud¶
In [19]:
fpath = "/usr/share/fonts/opentype/ipaexfont-gothic/ipaexg.ttf"
In [20]:
sorted_labels = sorted(df['Gemini-1.5-Pro'].unique())
for label in sorted_labels:
text_data = df[df['Gemini-1.5-Pro'] == label]['text'].str.cat(sep=' ')
wc = WordCloud(width=800, height=400, background_color='white',
font_path=fpath).generate(text_data)
plt.figure(figsize=(10, 5))
plt.imshow(wc)
plt.axis('off')
plt.title(f'Word Cloud for Label: {label}')
plt.show()
In [21]:
excluded_words = set(['寝る', '前', '直前', 'する', '就寝', '寝室'])
sorted_labels = sorted(df['Gemini-1.5-Pro'].unique())
for label in sorted_labels:
text_data = df[df['Gemini-1.5-Pro'] == label]['text'].str.cat(sep=' ')
wc = WordCloud(width=800, height=400, background_color='white',
font_path=fpath, stopwords=STOPWORDS.union(excluded_words)).generate(text_data)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud for Label: {label}')
plt.show()
In [22]:
excluded_words = set(['寝る', '前', '直前', 'する', '就寝', '寝室', '夜', '見る'])
sorted_labels = sorted(df['Gemini-1.5-Pro'].unique())
for label in sorted_labels:
text_data = df[df['Gemini-1.5-Pro'] == label]['text'].str.cat(sep=' ')
wc = WordCloud(width=800, height=400, background_color='white',
font_path=fpath, stopwords=STOPWORDS.union(excluded_words)).generate(text_data)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud for Label: {label}')
plt.show()
In [23]:
excluded_words = set(['寝る', '前', '直前', 'する', '就寝', '寝室', '夜', '見る', '軽い', '楽しむ'])
sorted_labels = sorted(df['Gemini-1.5-Pro'].unique())
for label in sorted_labels:
text_data = df[df['Gemini-1.5-Pro'] == label]['text'].str.cat(sep=' ')
wc = WordCloud(width=800, height=400, background_color='white',
font_path=fpath, stopwords=STOPWORDS.union(excluded_words)).generate(text_data)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud for Label: {label}')
plt.show()
In [ ]: